
rm(list = ls())

library(tidyverse)
library(readxl)
library(lubridate)

moscow_raw <- read_excel("f_MoscowDataRaw.xlsx")


moscow_dta_temp <- moscow_raw %>%
  select(start_time = V8, end_time = V9, randomid = RandomID,
         matches("S1|S2", ignore.case = FALSE),
         matches("G1|G2|G3", ignore.case = FALSE),
         contains("lottery"), contains("PSM"), contains("big5"),
         matches("job_sect|job_pref|job_likely|job_qualities"),
         bribe_nonexp = dishonesty_4,
         age, male = gender, class_yr, contains("dep"),
         gpa = GPA, olympiad, contains("EGE"), region,
         city_size, contains("religion"), family_income,
         contains("relatives"), 
         soc_science_knowledg, device
  )






#PRELIMINARY VARIABLE CONSTRUCTION FOR ACADEMIC DEPARTMENT

moscow_dta_temp <- moscow_dta_temp %>% mutate(dep_text = case_when(
                     grepl("бизнес|БИ|Логист|безопас|информ|сратег|менедж|бм|финансового|управлен", dep_TEXT, ignore.case = T) ~ "Bus",
                     grepl("между|миров|МЭ|МИЭФ", dep_TEXT, ignore.case = T) ~ "Intl",
                     grepl("экон|РЭШ|стат|финансов", dep_TEXT, ignore.case = T) ~ "Econ",
                     grepl("интел|комп|Програм|ФКН", dep_TEXT, ignore.case = T) ~ "CS",
                     grepl("ВШУ|урбан", dep_TEXT, ignore.case = T) ~ "Urban",
                     grepl("гуман|ФГН|истор|Линг|филос", dep_TEXT, ignore.case = T) ~ "Humanities",
                     grepl("матем|Мвтем|МИЭМ|ДПМ", dep_TEXT, ignore.case = T) ~ "Math",
                     grepl("комм|медиа|дизайн|журнал|реклам", dep_TEXT, ignore.case = T) ~ "Comm",
                     grepl("прав|юрис", dep_TEXT, ignore.case = T) ~ "Law",
                     grepl("Искусства|УиЭЗ|нет деп|-", dep_TEXT, ignore.case = T) ~ "Other")
                      )

#OTHER PRELIMINARY CLEANING

moscow_dta_temp <- moscow_dta_temp %>% mutate_at(vars(contains("_work_")), list(~replace_na(., 0)))



#CREATING DATA SET

moscow_dta <- moscow_dta_temp %>% transmute(
                  #DICTATOR GAME
                  donate = G1_donate_1, 
                  #converts to ruble amount
                  donate50 = donate*50,
                  #RISK AVERSION MEASURE
                  risk_averse1 = 8 - (lottery_1 + lottery_2 + lottery_3 + lottery_4 + lottery_5 + lottery_6 + lottery_7),
                  #this version removes subjects with non-transitive preferences
                  risk_averse2 = ifelse((lottery_1 <= lottery_2 & 
                                           lottery_2 <= lottery_3 & 
                                           lottery_3 <= lottery_4 &
                                           lottery_4 <= lottery_5 &
                                           lottery_5 <= lottery_6 & 
                                           lottery_6 <= lottery_7) == TRUE, risk_averse1, NA),
                  #CAREER PREFERENCE VARIABLES
                  job_sect,
                  fed_pref = job_pref_1,
                  reg_pref = job_pref_2,
                  corp_pref = job_pref_3,
                  sme_pref = job_pref_4,
                  owner_pref = job_pref_5,
                  finance_pref = job_pref_6,
                  consult_pref = job_pref_7,
                  ngo_pref = job_pref_8,
                  gov_pref = job_pref_9,
                  fed_exp = job_likely_1,
                  reg_exp = job_likely_2,
                  corp_exp = job_likely_3,
                  sme_exp = job_likely_4,
                  owner_exp = job_likely_5,
                  finance_exp = job_likely_6,
                  consult_exp = job_likely_7,
                  ngo_exp = job_likely_8,
                  gov_exp = job_likely_9,
                  pub_avg = (fed_pref + reg_pref + gov_pref)/3,
                  priv_avg = (corp_pref + finance_pref + sme_pref + consult_pref + owner_pref)/5,
                  #JOB ATTRIBUTES
                  job_stable = job_qualities_1,
                  job_income = job_qualities_2,
                  job_bens = job_qualities_3,
                  job_promo = job_qualities_4,
                  job_interest = job_qualities_5,
                  job_altruism = job_qualities_6,
                  job_social = job_qualities_7,
                  job_schedule = job_qualities_8,
                  job_connect = job_qualities_9,
                  job_prestige = job_qualities_10,
                  job_intrinsic = (job_social + job_altruism + job_interest)/3,
                  job_extrinsic = (job_income + job_promo + job_connect + job_prestige)/4,
                  job_pragmatic = (job_stable + job_bens + job_schedule)/3,
                  #JUSTIFYING BRIBERY
                  bribe_nonexp, 
                  bribe_nonexp_dich = ifelse(bribe_nonexp < 3, 0, 1),
                  #PSM
                  aps1 = PSM_1, aps2 = PSM_2, aps3 = PSM_3, aps4 = PSM_4,
                  cpv1 = PSM_5, cpv2 = PSM_6, cpv3 = PSM_7, cpv4 = PSM_8,
                  com1 = PSM_9, com2 = PSM_10, com3 = PSM_11, com4 = PSM_12,
                  ss1 = PSM_13, ss2 = PSM_14, ss3 = PSM_15, ss4 = PSM_16,
                  aps = (aps1 + aps2 + aps3 + aps4)/4,
                  cpv = (cpv1 + cpv2 + cpv3 + cpv4)/4,
                  com = (com1 + com2 + com3 + com4)/4,
                  ss = (ss1 + ss2 + ss3 + ss4)/4,
                  psm = (aps + cpv + com + ss)/4,
                  #CONTROL VARIABLES
                  age, male, olympiad, gpa, family_income,
                  city_size, religion, religion_services,
                  religion_denom = recode(religion_denom, `1` = "Orthodox Christianity", `2` =  "Protestantism",
                                          `3` = "Catholicism", `4` = "Islam", `5` = "Judaism", `6` = "Buddhism",
                                          `7` = "Other"),
                  ege_taken = EGE_taken,
                  ege_math = as.numeric(EGE_3_TEXT),
                  ege_ru = na_if(EGE_10_TEXT, "-"),
                  ege_ru = as.numeric(ege_ru),
                  ege_avg = (ege_math + ege_ru)/2,
                  region1 = recode(region, `1` = "Moscow", `44` = "Moscow", `99` = "Foreign", .default = "Regions"),
                  region2 = plyr::mapvalues(region, 
                                            from = c(3, 8, 14, 25, 34, 57, #southern
                                                     1, 10, 11, 13, 16, 20, 26, 38, 40, 44, 52, 58, 66, 68, 70, 72, 85,#central
                                                     9, 31, 43, 47, 51, 53, 54, 59, 60, 75, 76, 82, 69,#volga
                                                     65, 67, #n caucasus
                                                     36, #crimea
                                                     2, 7, 15, 24, 29, 32, 48, #northwest
                                                     63, 74, 79, 80, 84, #ural
                                                     5, 12, 22, 30, 35, 49, 50, 73, 78, #siberia
                                                     6, 27, 41, 55, 62, 77, 61, #far east
                                                     99),#foreign 
                                            to = c(rep("Southern", 6), rep("Central", 17), rep("Volga", 13), rep("N. Caucasus", 2),
                                                   rep("Crimea", 1), rep("Northwest", 7), rep("Ural", 5), rep("Siberia", 9), 
                                                   rep("Far East", 7), rep("Foreign", 1))),
                  f_work_fed = relatives_work_1_1, m_work_fed = relatives_work_1_2, r_work_fed = relatives_work_1_3,
                  f_work_reg = relatives_work_2_1, m_work_reg = relatives_work_2_2, r_work_reg = relatives_work_2_3,
                  f_work_gov = relatives_work_3_1, m_work_gov = relatives_work_3_2, r_work_gov = relatives_work_3_3,
                  f_work_corp = relatives_work_4_1, m_work_corp = relatives_work_4_2, r_work_corp = relatives_work_4_3,
                  f_work_sme = relatives_work_5_1, m_work_sme = relatives_work_5_2, r_work_sme = relatives_work_5_3,
                  f_work_owner = relatives_work_6_1, m_work_owner = relatives_work_6_2, r_work_owner = relatives_work_6_3,
                  f_work_finance = relatives_work_7_1, m_work_finance = relatives_work_7_2, r_work_finance = relatives_work_7_3,
                  f_work_ngo = relatives_work_8_1, m_work_ngo = relatives_work_8_2, r_work_ngo = relatives_work_8_3,
                  f_work_law = relatives_work_9_1, m_work_law = relatives_work_9_2, r_work_law = relatives_work_9_3,
                  f_work_mil = relatives_work_10_1, m_work_mil = relatives_work_10_2, r_work_mil = relatives_work_10_3,
                  f_work_other = relatives_work_11_1, m_work_other = relatives_work_11_2, r_work_other = relatives_work_11_3,
                  f_no_work = relatives_work_12_1, m_no_work = relatives_work_12_2, r_no_work = relatives_work_12_3,
                  f_work_na = relatives_work_13_1, m_work_na = relatives_work_13_2, r_work_na = relatives_work_13_3,
                  f_pubsect = ifelse(f_work_fed == 1 | f_work_reg == 1 | f_work_gov == 1, 1, 0),
                  m_pubsect = ifelse(m_work_fed == 1 | m_work_reg == 1 | m_work_gov == 1, 1, 0),
                  r_pubsect = ifelse(r_work_fed == 1 | r_work_reg == 1 | r_work_gov == 1, 1, 0),
                  rel_pubsect = ifelse(f_pubsect == 1 | m_pubsect == 1 | r_pubsect == 1, 1, 0),
                  #collapsing all MA students together
                  class_yr = recode(class_yr, `6` = 5),
                  #integrating write-in info on departments
                  dep = ifelse(dep == 5 & is.na(dep_text) == FALSE, dep_text, dep),
                  dep = recode_factor(dep, `1` = "PA", `2` = "PS", `3` = "Soc", `4` = "Psych", `5` = "Other"),
                  dep2 = recode_factor(dep, "Law" = "Other", "Humanities" = "Other", "CS" = "Other", "Math" = "Other", "Intl" = "Other", "Urban" = "Other", "Psych" = "Other"),
                  #QUALITY CHECK VARIABLES
                  randomid, soc_sci_knowledge = soc_science_knowledg, device,
                  #time to complete
                  duration = round(difftime(end_time, start_time, units=c("auto","secs","mins","hours","days","weeks")), 0),
                  #screeners
                  screener1 = ifelse(is.na(S1_27) == F & is.na(S1_29) == F, 1, 0),
                  screener2 = ifelse(is.na(S2_5) == F & is.na(S2_10) == F, 1, 0),
                  screener = screener1 + screener2
                  )
  

##MERGING FOLLOW UP SURVEYS

moscow17raw <- read_excel("g_MoscowFollowUp2017.xlsx")
moscow18raw <- read_excel("h_MoscowFollowUp2018.xlsx")
moscow19raw <- read_excel("i_MoscowFollowUp2019.xlsx")

moscow19 <- moscow19raw %>% select(Q1, RandomID, contains("PSM"))
moscow18 <- moscow18raw %>% select(Q1, RandomID)
moscow17 <- moscow17raw %>% select(Q1, RandomID)

moscow19 <- moscow19 %>% mutate(RandomID = as.numeric(RandomID))
moscow19 <- moscow19 %>% rename(randomid = "RandomID", career19 = "Q1",
            aps1_2019 = "PSM_1", aps2_2019 = "PSM_2", aps3_2019 = "PSM_3", aps4_2019 = "PSM_4",
            cpv1_2019 = "PSM_5", cpv2_2019 = "PSM_6", cpv3_2019 = "PSM_7", cpv4_2019 = "PSM_8",
            com1_2019 = "PSM_9",  com2_2019 = "PSM_10",  com3_2019 = "PSM_11",  com4_2019 = "PSM_12",
            ss1_2019 = "PSM_13", ss2_2019 = "PSM_14", ss3_2019 = "PSM_15", ss4_2019 = "PSM_16")            
moscow18 <- moscow18 %>% rename(randomid = "RandomID", career18 = "Q1")
moscow17 <- moscow17 %>% rename(randomid = "RandomID", career17 = "Q1")

#checking for duplicate participant IDs
moscow_dta %>% count(randomid) %>% filter(n > 1)
#these two IDs have duplicates 29566, 80655    

moscow19 %>% count(randomid) %>% filter(n > 1)
moscow18 %>% count(randomid) %>% filter(n > 1)
moscow17 %>% count(randomid) %>% filter(n > 1)

#checking for IDs duplicated in original in follow up surveys
moscow19 %>% filter(randomid == 29566, randomid == 80655)
moscow18 %>% filter(randomid == 29566, randomid == 80655)
moscow17 %>% filter(randomid == 29566, randomid == 80655)
#neither of the problematic RandomIDs are in the follow up sample

#merging
moscow_dta_clean_temp1 <- moscow_dta %>% left_join(moscow19, by = "randomid")  
moscow_dta_clean_temp2 <- moscow_dta_clean_temp1 %>% left_join(moscow18, by = "randomid")  
moscow_dta_clean_temp3 <- moscow_dta_clean_temp2 %>% left_join(moscow17, by = "randomid")  



MoscowClean <- moscow_dta_clean_temp3 %>% mutate(
                                  #removing participants who report post-graduate studies or not working
                                  career_ifwork19 = ifelse(career19 %in% c(10, 12), NA, career19),
                                  #creating variable for whether employed in any public sector position
                                  pub19 = ifelse(career_ifwork19 == 1 | career_ifwork19 == 2 | career_ifwork19 == 3, 1, 0), 
                                  #creating variable for whether employed in any public sector position or ngo
                                  pub_ngo19 = ifelse(career_ifwork19 == 1 | career_ifwork19 == 2 | career_ifwork19 == 3 | career_ifwork19 == 9, 1, 0),
                                  #creating variable for whether employed in budget sector
                                  gov19 = ifelse(career_ifwork19 == 3, 1, 0),
                                  #creating similar variables for 2017 and 2018 follow up surveys 
                                  career_ifwork18 = ifelse(career18 %in% c(10, 12), NA, career18),
                                  pub18 = ifelse(career_ifwork18 == 1 | career_ifwork18 == 2 | career_ifwork18 == 3, 1, 0), 
                                  pub_ngo18 = ifelse(career_ifwork18 == 1 | career_ifwork18 == 2 | career_ifwork18 == 3 | career_ifwork18 == 9, 1, 0),
                                  gov18 = ifelse(career_ifwork18 == 3, 1, 0),
                                  career_ifwork17 = ifelse(career17 %in% c(10, 12), NA, career17),
                                  pub17 = ifelse(career_ifwork17 == 1 | career_ifwork17 == 2 | career_ifwork17 == 3, 1, 0), 
                                  pub_ngo17 = ifelse(career_ifwork17 == 1 | career_ifwork17 == 2 | career_ifwork17 == 3 | career_ifwork17 == 9, 1, 0),
                                  gov17 = ifelse(career_ifwork17 == 3, 1, 0),
                                  #combining 2019, 2018 and 2017 data to use the latest type of employment
                                  pub = ifelse(is.na(pub19) == T & is.na(pub18) == T, pub17, 
                                                  ifelse(is.na(pub19) == T, pub18, pub19)),
                                  pub_ngo = ifelse(is.na(pub_ngo19) == T & is.na(pub_ngo18) == T, pub_ngo17, 
                                               ifelse(is.na(pub_ngo19) == T, pub_ngo18, pub_ngo19)),
                                  gov = ifelse(is.na(gov19) == T & is.na(gov18) == T, gov17, 
                                               ifelse(is.na(gov19) == T, gov18, gov19)),
                                  #indicators for participation in panel
                                  panel2017 = ifelse(is.na(career17) == F, 1, 0),
                                  panel2018 = ifelse(is.na(career18) == F, 1, 0),
                                  panel2019 = ifelse(is.na(career19) == F, 1, 0),
                                  panel = ifelse(panel2017 == 0 & panel2018 == 0 & panel2019 == 0, 0, 1),
                                  panel2017_2019 = ifelse(is.na(career17) == F & is.na(career18) == F & is.na(career19) == F, 1, 0),
                                  panel2017_2018 = ifelse(is.na(career17) == F & is.na(career18) == F, 1, 0),
                                  panel2018_2019 = ifelse(is.na(career18) == F & is.na(career19) == F, 1, 0),
                                  #PSM index
                                  aps2019 = (aps1_2019 + aps2_2019 + aps3_2019 + aps4_2019)/4,
                                  cpv2019 = (cpv1_2019 + cpv2_2019 + cpv3_2019 + cpv4_2019)/4,
                                  com2019 = (com1_2019 + com2_2019 + com3_2019 + com4_2019)/4,
                                  ss2019  = (ss1_2019 + ss2_2019 + ss3_2019 + ss4_2019)/4,
                                  psm2019 = (aps2019 + cpv2019 + com2019 + ss2019)/4,
                                  psm_change = psm2019 - psm
                                  )
#converting tibble to dataframe
MoscowClean_DF <- as.data.frame(MoscowClean)

#write.csv(MoscowClean_DF, "MoscowClean.csv", na = "", row.names = F)


